library(recommenderlab);library("ggplot2")#;library("pander")
# 2.2
##========================================##
##== Building the Recommendation Models ==##
##========================================##
#data(MovieLense)

# Read the ratings from disk and coerce them into the rating-matrix class
# that the rest of this script works with.
# NOTE(review): the coercion presumably expects one row per rating with the
# user, item and rating columns -- confirm against the CSV layout.
temp <- read.csv(file = "data/MovieLense.csv")
MovieLense <- as(object = temp, Class = "realRatingMatrix")

##========================================##
##== Item-based Collaborative Filtering ==##
##========================================##
# Collaborative filtering is a branch of recommendation that takes account of the
# information about different users. 
# The word "collaborative" refers to the fact that users collaborate with each other
# to recommend items. 
# In fact, the algorithms take account of user purchases and preferences. 
# The starting point is a rating matrix in which rows correspond to users and columns
# correspond to items.

# This code file will show you an example of item-based CF. 
# Given a new user, IBCF considers the user's purchases and recommends similar items. 
# The core algorithm is based on these steps:
# (1) For each two items, measure how similar they are in terms of having received
#    similar ratings by similar users
# (2) For each item, identify the k-most similar items
# (3) For each user, identify the items that are most similar to the user's purchases
# In the code below, we will see the overall approach to building an IBCF model. 
# Details are illustrated below.

# Fix the random seed so the sampling steps further down are reproducible.
set.seed(1)

# Keep only users with more than 50 ratings and movies with more than 100
# ratings, then build a normalized copy of that subset.
active_users <- rowCounts(MovieLense) > 50
popular_movies <- colCounts(MovieLense) > 100
data.used <- MovieLense[active_users, popular_movies]
data.used_norm <- normalize(data.used)

##== Step 1. Defining the training and test sets  ==##
# We will build the model using a part of the MovieLense dataset (the training set) and
# apply it on the other part (the test set). 
# Since it's not a topic of this chapter, we will not evaluate the model,
# but will only recommend movies to the users in the test set.
# The two sets are as follows:
# (A) Training set: This set includes users from which the model learns
# (B) Test set: This set includes users to whom we recommend movies
# The algorithm automatically normalizes the data, so we can use data.used
# that contains relevant users and movies of MovieLense. 
# We defined data.used above as the subset of MovieLense users who have rated
# more than 50 movies and movies that have been rated more than 100 times.
# First, we randomly define the train.ID vector that is TRUE for users in the
# training set and FALSE for the others. 
# We will set the probability in the training set as 80 percent:
# Draw one logical flag per row (user) of data.used: TRUE with probability
# 0.8 (training set), FALSE with probability 0.2 (test set).
n_users <- nrow(data.used)
train.ID <- sample(c(TRUE, FALSE), n_users, replace = TRUE, prob = c(0.8, 0.2))
#length(which(train.ID==TRUE))


#train.ID=as.integer(sample(unique(getData.frame(data.used)[,"user"]),0.8*560))
#subset(getData.frame(data.used),user==train.ID)




# Let's define the training and the test sets:
# Rows flagged FALSE in train.ID form the test set; rows flagged TRUE form
# the training set.
reccTest <- data.used[!train.ID, ]
reccTrain <- data.used[train.ID, ]

# Sanity check: the two parts together should account for every row of
# data.used.
nrow(reccTrain) + nrow(reccTest)

##== Step 2. Building the recommendation model ==##

#  The function to build models is recommender() with inputs as follows:
# (1) Data: This is the training set
# (2) Method: This is the name of the technique
# (3) Parameters: These are some optional parameters of the technique

# List the recommender algorithms registered for real-valued rating matrices.
recommender_models <- recommenderRegistry$get_entries(
  dataType = "realRatingMatrix"
)
names(recommender_models)

# Fit an item-based CF model on the training users: keep the k = 30 most
# similar items per item, using cosine similarity ("pearson" is the other
# option this script considered).
recc.output <- Recommender(
  data = reccTrain,
  method = "IBCF",
  parameter = list(k = 30, method = "cosine")
)

# Show the fitted model and its class.
recc.output
class(recc.output)

# recc.output is an object of the Recommender class; it contains the fitted model.

##== Step 3.  Exploring the recommender model ==##
# Using getModel, we can extract some details about the model, such as its description
# and parameters:
# Pull the fitted model out of the recommender and print its description
# and the k that was used.
model_details <- getModel(recc.output)
model_details$description
model_details$k

# The item-item similarity matrix: its class and dimensions.
class(model_details$sim)
dim(model_details$sim)

# Fig 2.2-1: heatmap of the whole similarity matrix in a new graphics device.
dev.new()
image(model_details$sim, main = "Heatmap of complete similarity matrix")

# Heatmap of a random 20 x 20 sub-block, which is easier to read.
n_items <- sample(seq_len(dim(model_details$sim)[1]), 20)
dev.new()
image(
  model_details$sim[n_items, n_items],
  main = paste0(
    "Heatmap of the randomly selected ", length(n_items), " rows and columns"
  )
)

# Most of the values are equal to 0.
# The reason is that each row contains only k non-zero elements (here k = 30).

##================================================##
## Applying the recommender model on the test set ##
##================================================##
 
# Number of movies to recommend to each test user.
n_recommended <- 7

# How IBCF scores candidates for one user:
#   (1) take the movies the user has rated; each rating acts as a weight,
#   (2) look up, in the similarity matrix, the items similar to those movies,
#   (3) multiply each weight by the corresponding similarity,
#   (4) sum the products per candidate item.
# The n highest-scoring items become the user's recommendations.
recc_predicted <- predict(
  object = recc.output,
  newdata = reccTest,
  n = n_recommended,
  type = "topNList" # alternatives considered: "ratings", "ratingMatrix"
)
recc_predicted
slotNames(recc_predicted)

# Recommendations for the first test user: item indices, then their labels.
recc_user_1 <- recc_predicted@items[[1]]
movies_user_1 <- recc_predicted@itemLabels[recc_user_1]
movies_user_1

# User identifiers attached to every individual rating in each set (one entry
# per rating, so users are repeated); unique() reduces them to distinct users.
users_train <- getData.frame(reccTrain)[, "user"]
users_test <- getData.frame(reccTest)[, "user"]
unique(users_train)
unique(users_test)
length(unique(users_train))
length(unique(users_test))

# Collect the recommended item indices, one column per test user.
RECC.test <- sapply(recc_predicted@items, unlist)
dim(RECC.test)

# Label the result with the row names of reccTest, i.e. the users in the
# order in which they were handed to predict(). The order of
# unique(users_test) depends on how getData.frame() arranges its rows and is
# not guaranteed to match that order, so it must not be used for labelling.
# sapply() only yields a matrix when every user received the same number of
# recommendations; otherwise it returns a list, which has names, not colnames.
if (is.matrix(RECC.test)) {
  colnames(RECC.test) <- rownames(reccTest)
} else {
  names(RECC.test) <- rownames(reccTest)
}



##========================================##
##== User-based Collaborative Filtering ==##
##========================================##

# For IBCF, the algorithm was based on items and the steps to identify
# recommendations were as follows:
# • Identify similar items that are purchased by the same people
# • Recommend to a new user the items that are similar to its purchases

# For UBCF, we will use the opposite approach. First, given a new user, we will
# identify its similar users. Then, we will recommend the top-rated items purchased by
# similar users. This approach is called user-based collaborative filtering. 

# For each new user, these are the steps:
# 1. Measure how similar each user is to the new one (e.g. cosine or Pearson similarity).
# 2. Identify the most similar users. The options are:
#   (2.1) Take account of the top k users (k-nearest_neighbors)
#   (2.2) Take account of the users whose similarity is above a defined threshold
# 3. Rate the items purchased by the most similar users. The rating is the average
# rating among similar users and the approaches are:
#   (3.1) Average rating
#   (3.2) Weighted average rating, using the similarities as weights
# 4. Pick the top-rated items.
# Like we did in the previous chapter, we will build a training and a test set.

# Now, we can start building the model directly.
# List the registered algorithms for real-valued rating matrices and show the
# parameters of the UBCF entry.
recommender_models <- recommenderRegistry$get_entries(
  dataType = "realRatingMatrix"
)

ubcf_parameters <- recommender_models$UBCF_realRatingMatrix$parameters
ubcf_parameters

# Build a parameter / default table. Each default is rendered as exactly one
# string so that the two columns always have the same length: unlist() would
# silently drop NULL defaults and expand vector-valued ones, making
# data.frame() fail with a length mismatch.
df_parameters <- data.frame(
  parameter = names(ubcf_parameters),
  default = vapply(
    ubcf_parameters,
    function(p) if (is.null(p)) "NULL" else paste(as.character(p), collapse = ", "),
    character(1)
  )
)
rownames(df_parameters) <- NULL

# pander is not attached by this script (its library() call is commented
# out), so use it only when it is installed and fall back to print().
if (requireNamespace("pander", quietly = TRUE)) {
  pander::pander(head(df_parameters))
} else {
  print(head(df_parameters))
}

#Some relevant parameters are:
# (1) method: This shows how to compute the similarity between users
# (2) nn: This shows the number of similar users
# Let's build a recommender model leaving the parameters to their defaults:

# Fit a user-based CF model on the training users with default parameters.
recc.output <- Recommender(data = reccTrain, method = "UBCF")
recc.output

# Extract the model details and list their components.
model_details <- getModel(recc.output)
names(model_details)

# pander is not attached by this script (its library() call is commented
# out), so use it only when it is installed and fall back to print().
model_elements <- data.frame(element = names(model_details))
if (requireNamespace("pander", quietly = TRUE)) {
  pander::pander(model_elements)
} else {
  print(model_elements)
}

# Apart from the description and parameters of the model, model_details
# contains a data element:
model_details$data

# The model_details$data object contains the rating matrix. The reason is that UBCF
# is a lazy-learning technique, which means that it needs to access all the data to
# perform a prediction.

##================================================##
## Applying the recommender model on the test set ##
##================================================##
# Like IBCF, we determine the top n_recommended (7) recommendations for each
# new user:
n_recommended <- 7
recc_predicted <- predict(object = recc.output,
                          newdata = reccTest,
                          n = n_recommended)
recc_predicted

# Matrix of recommended movie titles: one column per test user, one row per
# recommendation rank. The item indices in @items are translated through the
# prediction's own @itemLabels (the script never defines `ratings_movies`).
# Users who received fewer than n_recommended items are padded with NA so the
# result is always an n_recommended x n_users character matrix.
recc_matrix <- vapply(
  recc_predicted@items,
  function(x) {
    titles <- recc_predicted@itemLabels[x]
    length(titles) <- n_recommended # pads with NA when shorter
    titles
  },
  character(n_recommended)
)
dim(recc_matrix)

# Let's take a look at the first four users (or fewer, if the test set is
# smaller than that):
first_users <- recc_matrix[, seq_len(min(4, ncol(recc_matrix))), drop = FALSE]

# pander is not attached by this script (its library() call is commented
# out), so use it only when it is installed and fall back to print().
if (requireNamespace("pander", quietly = TRUE)) {
  pander::pander(first_users)
} else {
  print(first_users)
}


# We can also compute how many times each movie got recommended and build the
# related frequency histogram. 
# The following image displays the distribution of the numbers of items for UBCF:
# Count how many times each movie was recommended. unlist() makes this work
# whether recc_matrix is a matrix or a list; table() ignores NA entries.
# The counts are stored as a factor named by movie title.
number_of_items <- factor(table(unlist(recc_matrix)))

# Bar chart of "how many movies were recommended k times". Built with
# ggplot() + geom_bar() because qplot() is deprecated in recent ggplot2.
dev.new()
ggplot(
  data.frame(number_of_items = unname(number_of_items)),
  aes(x = number_of_items)
) +
  geom_bar() +
  ggtitle("Distribution of the number of items for UBCF")

# Compared with the IBCF, the distribution has a longer tail. This means that there are
# some movies that are recommended much more often than the others. The maximum
# is 29, compared with 11 for IBCF.

# Let's take a look at the top titles:
number_of_items_sorted <- sort(number_of_items, decreasing = TRUE)
number_of_items_top <- head(number_of_items_sorted, n = 4)
table_top <- data.frame(
  names(number_of_items_top),
  number_of_items_top)

table_top

# pander is not attached by this script (its library() call is commented
# out), so use it only when it is installed; table_top is already printed
# above, so no fallback is needed here.
if (requireNamespace("pander", quietly = TRUE)) {
  pander::pander(table_top)
}

# Comparing the results of UBCF with IBCF helps in understanding the algorithm
# better. 
# (1) UBCF needs to access the initial data, so it is a lazy-learning model. 
# Since it needs to keep the entire database in memory, it doesn't work well for big rating matrix. 
# (2) Also, building the similarity matrix requires a lot of computing power and time.

# However, UBCF is reported to be slightly more accurate than IBCF, so it's a
# good option if the dataset is not too big.


